Today, we will work with daily water temperature and air temperature data observed for 31 rivers in Spain. The goal of this tutorial is to identify the best model for predicting the maximum water temperature given the maximum air temperature. In the preview below, W represents the daily maximum water temperature and A represents the daily maximum air temperature. The data contains almost a full year of data for each of the 31 different rivers.
## # A tibble: 6 x 8
## JULIAN_DAY YEAR L W A TIME MONTH DAY
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 2003 103 14.2 21.2 1 1 1
## 2 2 2003 103 14.4 16.8 2 1 2
## 3 3 2003 103 14.4 15.4 3 1 3
## 4 4 2003 103 10.9 10.8 4 1 4
## 5 5 2003 103 10.8 11.7 5 1 5
## 6 6 2003 103 10.7 12.4 6 1 6
Please modify the code chunk to visualize the relationship between A and W with a scatterplot (set alpha=0.3) and a smooth line.
# Scatterplot of daily maximum water temperature (W) against daily
# maximum air temperature (A), with a smoothed trend line on top.
# Mapping x/y once in ggplot() is inherited by both layers, so this
# draws exactly the same plot as mapping each layer separately.
ggplot(data = DATA, aes(x = A, y = W)) +
  geom_point(alpha = 0.3) +
  geom_smooth() +
  theme_minimal()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Do You Think This Relationship is the Same for All Locations? Please write a function that visualizes the relationship between A and W with a scatterplot (set alpha=0.3) and a smooth line for a specified Location L.
Test your function to see if it works:
# Sanity-check the location-specific plotting function on three rivers
# (WAPLOT.func is defined in an earlier chunk; its argument is the
# location code L).
WAPLOT.func(103)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
WAPLOT.func(105)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
WAPLOT.func(918)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Hold out 3 randomly chosen rivers as a spatial test set; the seed makes
# the split reproducible. Splitting by location (rather than by row) tests
# how well a model generalizes to rivers it has never seen.
set.seed(216)
TEST.LOCATIONS=sample(x=unique(DATA$L),size=3,replace=F)
# anti_join keeps rows whose L is NOT in TEST.LOCATIONS (training rivers);
# semi_join keeps rows whose L IS in TEST.LOCATIONS (test rivers).
TRAIN = anti_join(DATA,tibble(L=TEST.LOCATIONS),by="L")
TEST = semi_join(DATA,tibble(L=TEST.LOCATIONS),by="L")
Please write a function that visualizes the relationship between A and W with a scatterplot (set alpha=0.3) and a smooth line for a specified Dataset.
# Visualize the W-vs-A relationship (scatter with alpha = 0.3, plus a
# smooth line) for any dataset that contains columns A and W.
# The shared aes() in ggplot() is inherited by both layers, so the
# resulting plot is identical to mapping x/y in each layer.
WAPLOT2.func=function(DATA){
  ggplot(data = DATA, aes(x = A, y = W)) +
    geom_point(alpha = 0.3) +
    geom_smooth() +
    theme_minimal()
}
Test the function:
# Compare the pooled relationship on the training rivers ...
WAPLOT2.func(TRAIN)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# ... with the three held-out test rivers.
WAPLOT2.func(TEST)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Please complete the code chunk to fit a linear regression model on the training set. (Formula: W~A)
# Baseline model: simple linear regression of maximum water temperature
# on maximum air temperature, fit on the training rivers only.
linmod=lm(W~A, data=TRAIN)
summary(linmod)
##
## Call:
## lm(formula = W ~ A, data = TRAIN)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.1495 -2.1024 -0.1857 1.8851 16.8637
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.394990 0.087161 38.95 <0.0000000000000002 ***
## A 0.649422 0.004101 158.36 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.075 on 8866 degrees of freedom
## (1371 observations deleted due to missingness)
## Multiple R-squared: 0.7388, Adjusted R-squared: 0.7388
## F-statistic: 2.508e+04 on 1 and 8866 DF, p-value: < 0.00000000000000022
# Attach linear-model fitted values ("linpred") and residuals ("linres")
# to both splits; modelr's add_* helpers append one column and keep all
# existing columns intact.
TRAIN2 = TRAIN %>% add_predictions(linmod,var="linpred")
TEST2 = TEST %>% add_predictions(linmod,var="linpred")
TRAIN3 = TRAIN2 %>% add_residuals(linmod,var="linres")
TEST3 = TEST2 %>% add_residuals(linmod,var="linres")
# Nested polynomial expansions of A (degrees 2-4); I() protects the
# arithmetic so ^ is treated as exponentiation, not formula syntax.
poly2mod=lm(W~A+I(A^2),data=TRAIN)
poly3mod=lm(W~A+I(A^2)+I(A^3),data=TRAIN)
poly4mod=lm(W~A+I(A^2)+I(A^3)+I(A^4),data=TRAIN)
# Sequential likelihood-ratio-style tests: each model is compared to the
# previous (nested) one, asking whether the added degree is worthwhile.
anova(linmod,poly2mod,poly3mod,poly4mod,test="Chisq")
## Analysis of Variance Table
##
## Model 1: W ~ A
## Model 2: W ~ A + I(A^2)
## Model 3: W ~ A + I(A^2) + I(A^3)
## Model 4: W ~ A + I(A^2) + I(A^3) + I(A^4)
## Res.Df RSS Df Sum of Sq Pr(>Chi)
## 1 8866 83855
## 2 8865 83553 1 302.01 0.00000001284 ***
## 3 8864 82840 1 713.09 < 0.00000000000000022 ***
## 4 8863 82730 1 109.48 0.0006152 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Please add the predictions of polynomial regression models to the training and testing sets with column names poly2pred, poly3pred and poly4pred.
# Append the three polynomial models' predictions, chained onto the data
# that already carries the linear-model columns.
TRAIN4 =TRAIN3 %>%
add_predictions(poly2mod,var='poly2pred') %>%
add_predictions(poly3mod,var='poly3pred') %>%
add_predictions(poly4mod,var='poly4pred')
TEST4 =TEST3 %>%
add_predictions(poly2mod,var='poly2pred') %>%
add_predictions(poly3mod,var='poly3pred') %>%
add_predictions(poly4mod,var='poly4pred')
Please add the residuals of polynomial regression models to the training and testing sets with column names poly2res, poly3res and poly4res.
# Append the matching residual columns for each polynomial model.
TRAIN5 =TRAIN4 %>%
add_residuals(poly2mod,var='poly2res') %>%
add_residuals(poly3mod,var='poly3res') %>%
add_residuals(poly4mod,var='poly4res')
TEST5 =TEST4 %>%
add_residuals(poly2mod,var='poly2res') %>%
add_residuals(poly3mod,var='poly3res') %>%
add_residuals(poly4mod,var='poly4res')
\[y=d+\frac{a}{1+e^{(b-cx)}}\]
Please write a function to calculate mean squared error for the nonlinear logistic model.
# Four-parameter logistic curve: y = d + a / (1 + exp(b - c*x)).
#
# COEF: numeric vector c(d, a, b, c) — offset, height, location, rate.
# DATA: a data frame (or list) with column A, the maximum air temperature.
# Returns the vector of predicted maximum water temperatures.
#
# Fix: the original assigned to a local `pred` and relied on the
# *invisible* value of that assignment as the function's result. Returning
# the expression directly is equivalent in every use here, but also prints
# when called at top level and avoids the dead local variable.
logistic.model=function(COEF,DATA){
  COEF[1]+COEF[2]/(1+exp(COEF[3]-COEF[4]*DATA$A))
}
# Mean squared error of the nonlinear logistic model on DATA.
#
# COEF: parameter vector c(d, a, b, c) passed through to logistic.model().
# DATA: data frame with columns W (observed) and A (predictor).
# Returns a single number; NA residuals (missing W or A) are dropped so
# optim() always receives a finite objective value.
#
# Fix: spelled out na.rm = TRUE — `T` is an ordinary variable that can be
# reassigned, so it should not be relied on as a logical constant.
MSE.logistic=function(COEF,DATA){
  residuals = DATA$W - logistic.model(COEF,DATA)
  mean(residuals^2, na.rm = TRUE)
}
# Fit the logistic curve by minimizing training MSE with optim()
# (default Nelder-Mead). Starting values: d near min(W), a near the
# range of W, b near median(A), and rate c = 1.
# NOTE(review): the printed result below shows $convergence == 1, which
# means optim stopped at its iteration limit (default maxit = 500 for
# Nelder-Mead, matching $counts) rather than converging. Consider
# rerunning with control = list(maxit = 2000) and confirming the fit —
# left unchanged here so the code matches the rendered output.
logistic.mod=optim(
par=c(min(TRAIN$W,na.rm=T),
max(TRAIN$W,na.rm=T)-min(TRAIN$W,na.rm=T),
median(TRAIN$A,na.rm=T),
1), #Smart Starting Values
fn=MSE.logistic, #Function to Minimize
DATA=TRAIN #Required Argument
)
print(logistic.mod)
## $par
## [1] 12.282190 36.386633 5.026564 0.136390
##
## $value
## [1] 13.50391
##
## $counts
## function gradient
## 501 NA
##
## $convergence
## [1] 1
##
## $message
## NULL
# Attach the fitted logistic-curve predictions ("logpred") and residuals
# ("logres") to both splits, using the coefficients found by optim().
TRAIN6 = TRAIN5 %>%
  mutate(logpred = logistic.model(COEF = logistic.mod$par, DATA = TRAIN5),
         logres  = W - logpred)
TEST6 = TEST5 %>%
  mutate(logpred = logistic.model(COEF = logistic.mod$par, DATA = TEST5),
         logres  = W - logpred)
The function save.image() in R can be used to save all objects in the global environment. This is very helpful when you want to work off your results without rerunning all of the previous R code. The name of the exported file should end with the extension .Rdata. These files can be extremely large depending on how much RAM was utilized in your R session. The function load() can be used to import a previous workspace.
For more information on .Rdata file types, see https://fileinfo.com/extension/rdata for help.
# Snapshot every object in the global environment so a later session can
# resume without rerunning the chunks above.
save.image("Tutorial.Rdata")
# NOTE(review): "PATH_TO_THE_FILE" is a placeholder — replace it with the
# actual path (e.g. "Tutorial.Rdata") before running, or load() will error.
load("PATH_TO_THE_FILE")
# Overlay every model's predicted curve on the held-out observations.
# factor_key=T keeps the models in column order so the legend is stable.
# NOTE(review): gather() is superseded; pivot_longer() is the modern
# equivalent — kept here to match the rest of the tutorial.
TEST6 %>%
select(L,A,W,linpred,poly2pred,poly3pred,poly4pred,logpred)%>%
gather(linpred:logpred,key="Model",value="Pred",factor_key=T) %>%
ggplot() +
geom_point(aes(x=A,y=W),color="gray") +
theme_minimal() +
geom_line(aes(x=A,y=Pred,color=Model),size=2)
# Predicted vs. observed water temperature on the test set; points near
# the identity line indicate a well-calibrated model.
TEST6 %>%
  select(L,A,W,linpred,poly2pred,poly3pred,poly4pred,logpred)%>%
  gather(linpred:logpred,key="Model",value="Pred",factor_key=T) %>%
  ggplot() +
  geom_point(aes(x=W,y=Pred,color=Model)) +
  # Fix: geom_abline() takes `intercept` and `slope`, not `a`/`b`. The
  # original arguments were silently ignored ("Ignoring unknown
  # parameters" warning) and the y = x line only appeared because those
  # happen to be geom_abline's defaults. Name them explicitly.
  geom_abline(intercept=0,slope=1,color="black",size=2) +
  xlab("Maximum Water Temperature") +
  ylab("Predicted Water Temperature") +
  theme_minimal()
# Test-set residuals over time, one facet per model; systematic waves
# around the dashed zero line reveal seasonal structure a model misses.
TEST6 %>%
select(L,A,W,TIME,linres,poly2res,poly3res,poly4res,logres)%>%
gather(linres:logres,key="Model",value="Res",factor_key=T) %>%
ggplot() +
geom_line(aes(x=TIME,y=Res),color="lightskyblue2") +
geom_hline(yintercept=0,color="black",linetype=2,size=1.5) +
xlab("Time") +
ylab("Residual") +
theme_dark() +
facet_grid(Model~.)
# Linear vs. logistic predictions overlaid on each held-out river
# separately (one facet per location L).
TEST6 %>%
select(L,A,W,TIME,linpred,logpred) %>%
gather(linpred:logpred,key="Model",value="Pred",factor_key=T) %>%
ggplot() +
geom_point(aes(x=A,y=W),alpha=0.2) +
geom_line(aes(x=A,y=Pred,color=Model),size=2) +
theme_minimal()+facet_grid(L~.)
# Residuals against air temperature, per river: curvature around the
# zero line indicates the model's functional form is wrong in that range.
TEST6 %>%
select(L,A,W,TIME,linres,logres) %>%
gather(linres:logres,key="Model",value="Res",factor_key=T) %>%
ggplot() +
geom_point(aes(x=A,y=Res,color=Model)) +
geom_hline(yintercept=0) +
theme_minimal()+facet_grid(L~.)
# The same residuals against time, per river, to expose any remaining
# seasonal (time-dependent) error pattern.
TEST6 %>%
select(L,A,W,TIME,linres,logres) %>%
gather(linres:logres,key="Model",value="Res",factor_key=T) %>%
ggplot() +
geom_point(aes(x=TIME,y=Res,color=Model)) +
geom_hline(yintercept=0) +
theme_minimal()+facet_grid(L~.)
# Mean bias of a residual vector: the average signed error, ignoring NAs.
# With residuals defined as observed - predicted (as elsewhere in this
# file), a positive bias means the model under-predicts on average.
# Fix: `T` replaced with the literal TRUE (T is reassignable), and the
# redundant temporary + return() dropped — the last expression is returned.
bias.func=function(res){
  mean(res, na.rm = TRUE)
}
# Mean absolute error of a residual vector, ignoring NAs.
# Fix: `T` replaced with the literal TRUE (T is reassignable), and the
# redundant temporary + return() dropped — the last expression is returned.
mae.func=function(res){
  mean(abs(res), na.rm = TRUE)
}
# Root mean squared error of a residual vector, ignoring NAs.
# Fix: `T` replaced with the literal TRUE (T is reassignable); the MSE
# temporary is folded into a single expression — behavior is unchanged.
rmse.func=function(res){
  sqrt(mean(res^2, na.rm = TRUE))
}
Please use apply function to calculate Bias, MAE and rMSE for the models.
# Keep only the five residual columns. apply() will coerce this tibble to
# a matrix, which is safe here because every selected column is numeric.
ex.res.mat=TEST6 %>% select(linres,poly2res,poly3res,poly4res,logres)
# Column-wise (MARGIN = 2) mean bias for each model.
apply(ex.res.mat,2,bias.func)
## linres poly2res poly3res poly4res logres
## 0.9534126 0.9742415 0.9903951 0.9920042 0.2613184
# Column-wise mean absolute error.
apply(ex.res.mat,2,mae.func)
## linres poly2res poly3res poly4res logres
## 2.750323 2.732399 2.706833 2.715366 3.135313
# Column-wise root mean squared error.
apply(ex.res.mat,2,rmse.func)
## linres poly2res poly3res poly4res logres
## 3.351594 3.344867 3.328889 3.338710 3.711664
# Long-format residual table with display-ready model names; factor_key=T
# preserves the column order so the summary rows come out Linear..Logistic.
SUMM1=TEST6 %>%
select(L,A,W,TIME,linres,poly2res,poly3res,poly4res,logres)%>%
rename(Linear=linres,`Poly(2)`=poly2res,`Poly(3)`=poly3res,`Poly(4)`=poly4res,Logistic=logres)%>%
gather(Linear:Logistic,key="Model",value="Residual",factor_key=T)
# One row of accuracy metrics per model: mean bias, MAE, and RMSE,
# reusing the same metric functions as the apply() version above.
SUMM2= SUMM1 %>%
group_by(Model) %>%
summarize(MB=bias.func(Residual),
MAE=mae.func(Residual),
RMSE=rmse.func(Residual))
## `summarise()` ungrouping output (override with `.groups` argument)
print(SUMM2)
## # A tibble: 5 x 4
## Model MB MAE RMSE
## <fct> <dbl> <dbl> <dbl>
## 1 Linear 0.953 2.75 3.35
## 2 Poly(2) 0.974 2.73 3.34
## 3 Poly(3) 0.990 2.71 3.33
## 4 Poly(4) 0.992 2.72 3.34
## 5 Logistic 0.261 3.14 3.71
## % latex table generated in R 4.0.2 by xtable 1.8-4 package
## % Tue Oct 13 13:05:11 2020
## \begin{table}[ht]
## \centering
## \begin{tabular}{llrrr}
## \hline
## & Model & MB & MAE & RMSE \\
## \hline
## 1 & Linear & 0.9534 & 2.7503 & 3.3516 \\
## 2 & Poly(2) & 0.9742 & 2.7324 & 3.3449 \\
## 3 & Poly(3) & 0.9904 & 2.7068 & 3.3289 \\
## 4 & Poly(4) & 0.9920 & 2.7154 & 3.3387 \\
## 5 & Logistic & 0.2613 & 3.1353 & 3.7117 \\
## \hline
## \end{tabular}
## \end{table}